import polars as pl
import pathlib
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from itables import init_notebook_mode
pio.renderers.default = 'plotly_mimetype+notebook'
init_notebook_mode(all_interactive=True)
path_data = pathlib.Path.cwd().parent / 'data'
path_input = path_data / 'input'
path_output = path_data / 'output'
path_df = path_output / 'HY1999-2016_20230713_P2A0.csv'
df = pl.read_csv(str(path_df), infer_schema_length=6000)
df.head()
| HarvestYear | ID2 | Longitude | Latitude | SampleID | Crop | GrainSampleArea_P1 | GrainMassWet_P1 | GrainMassWetInGrainSample_P1 | GrainMassWetInGrainSample_P2 | GrainMassOvenDryInGrainSample_P1 | GrainMassOvenDryInGrainSample_P2 | GrainMassAirDry_P1 | GrainMassOvenDry_P1 | GrainMassOvenDry_P2 | GrainMoistureProportion_P2 | GrainMoisture_P1 | GrainProtein_P1 | GrainStarch_P1 | GrainWGlutDM_P1 | GrainOilDM_P1 | GrainTestWeight_P1 | GrainCarbon_P1 | GrainNitrogen_P1 | GrainSulfur_P1 | BiomassSampleArea_P1 | BiomassWet_P1 | BiomassAirDry_P1 | GrainMassWetInBiomassSample_P1 | GrainMassOvenDryInBiomassSample_P1 | GrainMassOvenDryInBiomassSample_P2 | ResidueMassWetSubsample_P1 | ResidueMassOvenDrySubsample_P1 | ResidueMoistureProportionSubsample_P2 | ResidueCarbon_P1 | ResidueNitrogen_P1 | ResidueSulfur_P1 | Comments_P1 | GrainYieldWet_P2 | GrainYieldAirDry_P2 | GrainYieldOvenDry_P2 | ResidueMassWet_P2 | ResidueMassOvenDry_P2 | ResidueMassWetPerArea_P2 | ResidueMassOvenDryPerArea_P2 | ResidueMassAirDry_P2 | ResidueMassAirDryPerArea_P2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
Here's a big'o'table of mean values for all the measurement columns. Means were calculated per year and per crop. Some means may not make sense to calculate, so use your discretion.
non_metric_cols = ['HarvestYear', 'ID2', 'Longitude', 'Latitude', 'SampleID', 'Crop']
metric_cols = [x for x in df.columns if x not in non_metric_cols]
df_agg_yr = (
df.groupby(['HarvestYear', 'Crop'])
.agg(
[pl.col(f'{c}').mean().alias(f'{c}_mean') for c in metric_cols]
)
.sort(['Crop', 'HarvestYear'])
)
display(df_agg_yr)
| HarvestYear | Crop | GrainSampleArea_P1_mean | GrainMassWet_P1_mean | GrainMassWetInGrainSample_P1_mean | GrainMassWetInGrainSample_P2_mean | GrainMassOvenDryInGrainSample_P1_mean | GrainMassOvenDryInGrainSample_P2_mean | GrainMassAirDry_P1_mean | GrainMassOvenDry_P1_mean | GrainMassOvenDry_P2_mean | GrainMoistureProportion_P2_mean | GrainMoisture_P1_mean | GrainProtein_P1_mean | GrainStarch_P1_mean | GrainWGlutDM_P1_mean | GrainOilDM_P1_mean | GrainTestWeight_P1_mean | GrainCarbon_P1_mean | GrainNitrogen_P1_mean | GrainSulfur_P1_mean | BiomassSampleArea_P1_mean | BiomassWet_P1_mean | BiomassAirDry_P1_mean | GrainMassWetInBiomassSample_P1_mean | GrainMassOvenDryInBiomassSample_P1_mean | GrainMassOvenDryInBiomassSample_P2_mean | ResidueMassWetSubsample_P1_mean | ResidueMassOvenDrySubsample_P1_mean | ResidueMoistureProportionSubsample_P2_mean | ResidueCarbon_P1_mean | ResidueNitrogen_P1_mean | ResidueSulfur_P1_mean | Comments_P1_mean | GrainYieldWet_P2_mean | GrainYieldAirDry_P2_mean | GrainYieldOvenDry_P2_mean | ResidueMassWet_P2_mean | ResidueMassOvenDry_P2_mean | ResidueMassWetPerArea_P2_mean | ResidueMassOvenDryPerArea_P2_mean | ResidueMassAirDry_P2_mean | ResidueMassAirDryPerArea_P2_mean |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
def create_yield_comparison_plot(crop, grain_or_residue):
# init with 'grain' values
wet_var = 'GrainYieldWet_P2'
air_var = 'GrainYieldAirDry_P2'
oven_var = 'GrainYieldOvenDry_P2'
if(grain_or_residue == 'residue'):
wet_var = 'ResidueMassWetPerArea_P2'
air_var = 'ResidueMassAirDryPerArea_P2'
oven_var = 'ResidueMassOvenDryPerArea_P2'
fig = go.Figure()
crop_df = df.filter(pl.col('Crop') == crop)
x = crop_df['HarvestYear'].to_list()
# Start/End cutoffs for years were methods and/or data were different
cutoff_a0 = 1999
cutoff_a1 = 2010
cutoff_b0 = 2011
cutoff_b1 = 2012
cutoff_c0 = 2013
cutoff_c1 = 2016
# Wet averages
wet_mean_1999_2010 = crop_df.filter(pl.col('HarvestYear') <= cutoff_a1)[wet_var].mean()
wet_mean_2011_2012 = crop_df.filter((pl.col('HarvestYear') > cutoff_a1) & (pl.col('HarvestYear') <= cutoff_b1))[wet_var].mean()
wet_mean_2013_2016 = crop_df.filter(pl.col('HarvestYear') > cutoff_b1)[wet_var].mean()
# Air dry averages
air_mean_1999_2010 = crop_df.filter(pl.col('HarvestYear') <= cutoff_a1)[air_var].mean()
air_mean_2011_2012 = crop_df.filter((pl.col('HarvestYear') > cutoff_a1) & (pl.col('HarvestYear') <= cutoff_b1))[air_var].mean()
air_mean_2013_2016 = crop_df.filter(pl.col('HarvestYear') > cutoff_b1)[air_var].mean()
# Oven dry averages
oven_mean_1999_2010 = crop_df.filter(pl.col('HarvestYear') <= cutoff_a1)[oven_var].mean()
oven_mean_2011_2012 = crop_df.filter((pl.col('HarvestYear') > cutoff_a1) & (pl.col('HarvestYear') <= cutoff_b1))[oven_var].mean()
oven_mean_2013_2016 = crop_df.filter(pl.col('HarvestYear') > cutoff_b1)[oven_var].mean()
# Box plots
fig.add_trace(go.Box(
x = x,
y = crop_df[wet_var].to_list(),
name = 'Wet',
marker_color='blue'
))
fig.add_trace(go.Box(
x = x,
y = crop_df[air_var].to_list(),
name='Air',
marker_color='green'
))
fig.add_trace(go.Box(
x = x,
y = crop_df[oven_var].to_list(),
name='Oven',
marker_color = 'red'
))
# Title, xaxis, add dashed v lines
fig.update_layout(
boxmode='group',
title={'text':crop,
'y':0.9,
'x':0.5,
'xanchor':'center',
'yanchor':'top'},
xaxis=dict(range=[1998,2017]))
fig.add_vline(x = cutoff_a1 + 0.5, line_width = 3, line_dash='dash')
fig.add_vline(x = cutoff_b1 + 0.5, line_width = 3, line_dash='dash')
#Wet averages
fig.add_shape(type='line',
x0=cutoff_a0,
x1=cutoff_a1,
y0=wet_mean_1999_2010,
y1=wet_mean_1999_2010,
xref='x',
yref='y',
line = dict(color='Blue'))
fig.add_shape(type='line',
x0=cutoff_b0,
x1=cutoff_b1,
y0=wet_mean_2011_2012,
y1=wet_mean_2011_2012,
xref='x',
yref='y',
line = dict(color='Blue'))
fig.add_shape(type='line',
x0=cutoff_c0,
x1=cutoff_c1,
y0=wet_mean_2013_2016,
y1=wet_mean_2013_2016,
xref='x',
yref='y',
line = dict(color='Blue'))
#Wet averages
fig.add_shape(type='line',
x0=cutoff_a0,
x1=cutoff_a1,
y0=air_mean_1999_2010,
y1=air_mean_1999_2010,
xref='x',
yref='y',
line = dict(color='Green'))
fig.add_shape(type='line',
x0=cutoff_b0,
x1=cutoff_b1,
y0=air_mean_2011_2012,
y1=air_mean_2011_2012,
xref='x',
yref='y',
line = dict(color='Green'))
fig.add_shape(type='line',
x0=cutoff_c0,
x1=cutoff_c1,
y0=air_mean_2013_2016,
y1=air_mean_2013_2016,
xref='x',
yref='y',
line = dict(color='Green'))
#Oven averages lines
fig.add_shape(type='line',
x0=cutoff_a0,
x1=cutoff_a1,
y0=oven_mean_1999_2010,
y1=oven_mean_1999_2010,
xref='x',
yref='y',
line = dict(color='Red'))
fig.add_shape(type='line',
x0=cutoff_b0,
x1=cutoff_b1,
y0=oven_mean_2011_2012,
y1=oven_mean_2011_2012,
xref='x',
yref='y',
line = dict(color='Red'))
fig.add_shape(type='line',
x0=cutoff_c0,
x1=cutoff_c1,
y0=oven_mean_2013_2016,
y1=oven_mean_2013_2016,
xref='x',
yref='y',
line = dict(color='Red'))
fig.show()
crops = list(set(df['Crop'].to_list()))
The following figures show box plots of each crop across the years.
Vertical dashed lines indicate a change of methods (and/or available data) from the previous year to the next. That is, the way the biomass and/or grain was dried, and what data were recorded, changed.
The horizontal lines indicate mean values for the years within the vertical lines. E.g. In the below image, the blue horizontal line that spans the years 1999 - 2009 has the y-value of the means value for wet grain yield (g/m2).
The point of these figures is to get a feel of the variability of the data across years and between measurement methods.
(oh, and ignore AL, it's all funky)
for crop in crops:
create_yield_comparison_plot(crop, 'grain')
for crop in crops:
create_yield_comparison_plot(crop, 'residue')